{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrape the MBFC Dataset, Including Granular Bias Ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from tqdm import tqdm\n",
    "from tenacity import retry, stop_after_attempt, wait_fixed\n",
    "import tldextract\n",
    "import csv\n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "import pandas as pd\n",
    "import re\n",
    "from string import punctuation\n",
    "\n",
    "categories = [\n",
    "    \"center\",\n",
    "    \"left\",\n",
    "    \"leftcenter\",\n",
    "    \"right-center\",\n",
    "    \"right\",\n",
    "    \"conspiracy\",\n",
    "    \"fake-news\",\n",
    "    \"pro-science\",\n",
    "]\n",
    "\n",
    "def get_all_links(link):\n",
    "    page = requests.get(link)\n",
    "    soup = BeautifulSoup(page.content, \"html.parser\")\n",
    "    extract = soup.select(\"#mbfc-table a\")\n",
    "    hrefs = [x[\"href\"] for x in extract]\n",
    "    return hrefs\n",
    "\n",
    "@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))\n",
    "def get_granular_ratings(link):\n",
    "    result = {}\n",
    "    page = requests.get(link)\n",
    "    soup = BeautifulSoup(page.content, \"html.parser\")\n",
    "    main = soup.select(\"#main-content .clearfix\")\n",
    "    imgs = main[0].select(\"img\")\n",
    "    granular_rating = imgs[0][\"src\"]\n",
    "    paragraphs = soup.find_all(\"p\")\n",
    "    for p in paragraphs:\n",
    "        if \"Bias Rating:\" in p.get_text():\n",
    "            extract_text = p.get_text(separator=\" \")\n",
    "            lines = extract_text.strip().split(\"\\n\")\n",
    "            result = {}\n",
    "            for line in lines:\n",
    "                if \":\" in line:\n",
    "                    key, value = line.split(\":\", 1)\n",
    "                    result[key.strip()] = value.strip()\n",
    "            break\n",
    "    text = soup.select(\"p\")\n",
    "    source_link = [x.text for x in text if \"Source:\" in x.text]\n",
    "    if \": \" in source_link[0]:\n",
    "        link = source_link[0].split(\": \")[1]\n",
    "    elif \":\\xa0\" in source_link[0]:\n",
    "        link = source_link[0].split(\":\\xa0\")[1]\n",
    "\n",
    "    result[\"source_link\"] = link\n",
    "    result[\"source_domain\"] = tldextract.extract(link).domain\n",
    "    result[\"granular_rating\"] = granular_rating\n",
    "    return result\n",
    "\n",
    "\n",
    "links = {}\n",
    "for category in tqdm(categories):\n",
    "    links[category] = get_all_links(f\"https://mediabiasfactcheck.com/{category}/\")\n",
    "\n",
    "data = []\n",
    "error_count = {}\n",
    "\n",
    "def process_link(link, category):\n",
    "    try:\n",
    "        result = get_granular_ratings(link)\n",
    "        result[\"category\"] = category\n",
    "        data.append(result)\n",
    "    except Exception as e:\n",
    "        if category not in error_count:\n",
    "            error_count[category] = []\n",
    "        error_count[category].append([link, str(e)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "with ThreadPoolExecutor(max_workers=50) as executor:\n",
    "    futures = []\n",
    "    for category, link_list in links.items():\n",
    "        for link in link_list:\n",
    "            futures.append(executor.submit(process_link, link, category))\n",
    "\n",
    "    for future in tqdm(as_completed(futures), total=len(futures)):\n",
    "        pass\n",
    "\n",
    "    for category in categories:\n",
    "        print(f\"Errors in {category}: {error_count.get(category, 0)}\")\n",
    "\n",
    "for category in error_count:\n",
    "    error_count[category] = list(set(tuple(error) for error in error_count[category]))\n",
    "\n",
    "print(\"Running error links again...\")\n",
    "error_links = []\n",
    "for category, errors in error_count.items():\n",
    "    for error in errors:\n",
    "        error_links.append((error[0], category))\n",
    "\n",
    "with ThreadPoolExecutor(max_workers=50) as executor:\n",
    "    futures = []\n",
    "    for link, category in error_links:\n",
    "        futures.append(executor.submit(process_link, link, category))\n",
    "\n",
    "    for future in tqdm(as_completed(futures), total=len(futures)):\n",
    "        pass\n",
    "\n",
    "    for category in categories:\n",
    "        print(f\"Remaining errors in {category}: {len(error_count.get(category, [])) }\")\n",
    "\n",
    "\n",
    "unique_data = []\n",
    "unique_links = set()\n",
    "\n",
    "for item in data:\n",
    "    source_link = item[\"source_link\"]\n",
    "    if source_link not in unique_links:\n",
    "        unique_data.append(item)\n",
    "        unique_links.add(source_link)\n",
    "\n",
    "\n",
    "data = unique_data\n",
    "\n",
    "\n",
    "def clean_name(x):\n",
    "    x = (\n",
    "        x.lower()\n",
    "        .translate(str.maketrans(\"\", \"\", punctuation))\n",
    "        .replace(\"rank\", \"rating\")\n",
    "        .replace(\"’\", \"\")\n",
    "        .strip()\n",
    "    )\n",
    "    x = re.sub(\" +\", \" \", x)\n",
    "    x = x.replace(\" \", \"_\")\n",
    "    return x\n",
    "\n",
    "\n",
    "column_mapping = {\n",
    "    \"mbfcscountry_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"mbfcs_countryfreedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"mbfcs_press_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"mbrfcs_country_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"world_press_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"press_freedom_rating\": \"mbfcs_country_freedom_rating\",\n",
    "    \"mbfcs_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"press_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"mbfcs_country_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"credibility\": \"mbfc_credibility_rating\",\n",
    "    \"mbfcs_county_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"mbfcs_country_freedom_profile\": \"mbfc_country_freedom_rating\",\n",
    "    \"mbgcs_country_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"country_press_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "    \"mbfcs_countyry_freedom_rating\": \"mbfc_country_freedom_rating\",\n",
    "}\n",
    "\n",
    "data_clean = [{clean_name(k): v for k, v in entry.items()} for entry in data]\n",
    "data_clean = [\n",
    "    {column_mapping.get(k, k): v for k, v in entry.items()} for entry in data_clean\n",
    "]\n",
    "df = pd.DataFrame(data_clean)\n",
    "# Export to CSV\n",
    "df.to_csv(\"mediabiasfactcheck_fulldataset.csv\", index=False)\n",
    "\n",
    "df[\"granularrating_cat\"] = df[\"granularrating\"].apply(\n",
    "    lambda x: x.split(\"/\")[-1].split(\".\")[0]\n",
    ")\n",
    "\n",
    "df[\"granularrating_cat\"].apply(lambda x: re.sub(r\"\\d+\", \"\", x)).value_counts()\n",
    "political_categories = [\n",
    "    \"extremeright\",\n",
    "    \"right\",\n",
    "    \"rightcenter\",\n",
    "    \"leastbiased\",\n",
    "    \"leftcenter\",\n",
    "    \"left\",\n",
    "    \"extremeleft\",\n",
    "]\n",
    "df[\"political\"] = df[\"granularrating_cat\"].str.contains(\"|\".join(political_categories))\n",
    "df[\"level\"] = df[\"granularrating_cat\"].apply(\n",
    "    lambda x: x.replace(re.sub(r\"\\d+\", \"\", x), \"\")\n",
    ")\n",
    "df[\"granularrating_nolevel\"] = df[\"granularrating_cat\"].apply(\n",
    "    lambda x: re.sub(r\"\\d+\", \"\", x)\n",
    ")\n",
    "df[\"level\"] = df[\"level\"].apply(lambda x: x[0:2] if len(x) > 2 else x)\n",
    "\n",
    "df[df.granularrating_nolevel == \"extremeleft\"].groupby(\n",
    "    \"level\"\n",
    ").granularrating.unique().values\n",
    "df[\"granularrating_nolevel\"] = df[\"granularrating_cat\"].apply(\n",
    "    lambda x: re.sub(r\"\\d+\", \"\", x)\n",
    ")\n",
    "df[\"level\"] = df[\"granularrating_cat\"].apply(\n",
    "    lambda x: x.replace(re.sub(r\"\\d+\", \"\", x), \"\")\n",
    ")\n",
    "df[\"level\"] = df[\"level\"].apply(lambda x: x[:2] if len(x) > 2 else x)\n",
    "\n",
    "# Map the political levels from -36 to +36\n",
    "political_mapping = {\n",
    "    \"extremeleft\": -30,\n",
    "    \"left\": -20,\n",
    "    \"leftcenter\": -10,\n",
    "    \"leastbiased\": 0,\n",
    "    \"rightcenter\": 10,\n",
    "    \"right\": 20,\n",
    "    \"extremeright\": 30,\n",
    "}\n",
    "\n",
    "\n",
    "def map_political_level(row):\n",
    "    if row[\"granularrating_nolevel\"] not in political_mapping:\n",
    "        return None\n",
    "    category = row[\"granularrating_nolevel\"]\n",
    "    level = int(row[\"level\"])\n",
    "\n",
    "    if category in [\"extremeleft\", \"extremeright\"]:\n",
    "        level_mapping = {1: 6, 2: 5, 3: 4, 4: 3, 5: 2, 6: 1}\n",
    "    else:\n",
    "        level_mapping = {\n",
    "            1: -6,\n",
    "            2: -5,\n",
    "            3: -4,\n",
    "            4: -3,\n",
    "            5: -2,\n",
    "            6: -1,\n",
    "            7: 1,\n",
    "            8: 2,\n",
    "            9: 3,\n",
    "            10: 4,\n",
    "            11: 5,\n",
    "            12: 6,\n",
    "        }\n",
    "\n",
    "    return political_mapping[category] + level_mapping[level]\n",
    "\n",
    "\n",
    "df[\"political_level\"] = df.apply(map_political_level, axis=1)\n",
    "\n",
    "df = df.drop(\n",
    "    columns=[\n",
    "        \"granularrating_nolevel\",\n",
    "        \"level\",\n",
    "        \"granularrating_cat\",\n",
    "        \"questionable_reasoning\",\n",
    "        \"bias\",\n",
    "        \"reasoning\",\n",
    "    ]\n",
    ")\n",
    "df.to_csv(\"mediabiasfactcheck_fulldataset.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Repeat for Misinformation, Fake-News"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1523/1523 [02:16<00:00, 11.15it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Remaining errors in center: 0\n",
      "Remaining errors in left: 0\n",
      "Remaining errors in leftcenter: 0\n",
      "Remaining errors in right-center: 0\n",
      "Remaining errors in right: 0\n",
      "Remaining errors in conspiracy: 0\n",
      "Remaining errors in fake-news: 593\n",
      "Remaining errors in pro-science: 0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "links = get_all_links(\"https://mediabiasfactcheck.com/fake-news/\")\n",
    "links = [(link, \"fake-news\") for link in links]\n",
    "data = []\n",
    "error_count = {}\n",
    "\n",
    "with ThreadPoolExecutor(max_workers=50) as executor:\n",
    "    futures = []\n",
    "    for link, category in links:\n",
    "        futures.append(executor.submit(process_link, link, category))\n",
    "    for future in tqdm(as_completed(futures), total=len(futures)):\n",
    "        pass\n",
    "    for category in categories:\n",
    "        print(f\"Remaining errors in {category}: {len(error_count.get(category, [])) }\")\n",
    "\n",
    "# Repeat for error links\n",
    "error_links = []\n",
    "for category, errors in error_count.items():\n",
    "    for error in errors:\n",
    "        error_links.append((error[0], category))\n",
    "\n",
    "with ThreadPoolExecutor(max_workers=50) as executor:\n",
    "    futures = []\n",
    "    for link, category in error_links:\n",
    "        futures.append(executor.submit(process_link, link, category))\n",
    "    for future in tqdm(as_completed(futures), total=len(futures)):\n",
    "        pass\n",
    "    for category in categories:\n",
    "        print(f\"Remaining errors in {category}: {len(error_count.get(category, [])) }\")\n",
    "\n",
    "d = pd.DataFrame(data)\n",
    "d[~d.source_link.duplicated()].to_csv(\"../Data/mediabiasfactcheck_fakenews.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
